Mineria de datos - Proyecto Final

Librerías

library(tidyverse)
library(rmdformats)
library(stats)
library(cluster)
library(mclust)
library(factoextra)
library(dendextend)
library(DT)
library(purrr)
library(igraph)
library(tidygraph)
library(ggraph)
library(ggpubr)
library(clustertend)
library(fpc)
library(FactoMineR)
library(factoextra)
library(pvclust)
library(cluster.datasets)
library(mltools)
library(data.table)

Datos

# Read the Heart data set and drop the supervised label ("target") so the
# rest of the analysis stays strictly unsupervised.
file <- "/Users/antony.vargasulead.ac.cr/Mineria de datos/Proyecto Final/Data/Heart.csv"
df <- read.csv(file, sep = ",", dec = ".")
df$target <- NULL
DT::datatable(df)

2. Análisis descriptivo

2.1 Datos cuantitativos

# Keep only the continuous clinical measurements for the descriptive analysis.
vars_continuas <- c("age", "trestbps", "chol", "thalach", "oldpeak")
df_cualitativo <- df[vars_continuas]
head(df_cualitativo)
##   age trestbps chol thalach oldpeak
## 1  52      125  212     168     1.0
## 2  53      140  203     155     3.1
## 3  70      145  174     125     2.6
## 4  61      148  203     161     0.0
## 5  62      138  294     106     1.9
## 6  58      100  248     122     1.0

2.2 Resumen

# Five-number summary plus mean for each quantitative variable.
print("Resumen estadistico")
## [1] "Resumen estadistico"
summary(df_cualitativo)
##       age           trestbps          chol        thalach         oldpeak     
##  Min.   :29.00   Min.   : 94.0   Min.   :126   Min.   : 71.0   Min.   :0.000  
##  1st Qu.:48.00   1st Qu.:120.0   1st Qu.:211   1st Qu.:132.0   1st Qu.:0.000  
##  Median :56.00   Median :130.0   Median :240   Median :152.0   Median :0.800  
##  Mean   :54.43   Mean   :131.6   Mean   :246   Mean   :149.1   Mean   :1.072  
##  3rd Qu.:61.00   3rd Qu.:140.0   3rd Qu.:275   3rd Qu.:166.0   3rd Qu.:1.800  
##  Max.   :77.00   Max.   :200.0   Max.   :564   Max.   :202.0   Max.   :6.200

2.3 Desviaciones estándar

# Print the standard deviation of each quantitative variable.
# Fix: the loop iterated over colnames(df_cualitativo) but indexed `df`;
# it now indexes the same data frame it iterates over (same values here,
# since the columns exist in both, but the original was inconsistent).
for (i in colnames(df_cualitativo)) {
  print(i)
  print(sd(df_cualitativo[, i]))
}
## [1] "age"
## [1] 9.07229
## [1] "trestbps"
## [1] 17.51672
## [1] "chol"
## [1] 51.59251
## [1] "thalach"
## [1] 23.00572
## [1] "oldpeak"
## [1] 1.175053

2.4 Cuartiles en diagrama de cajas

# 2x3 panel layout for the five boxplots (last panel stays empty).
par(mfrow = c(2, 3))

# One notched boxplot per quantitative variable. The duplicated calls were
# collapsed into a loop; axis/main labels use the capitalized column name
# (this also fixes the original's inconsistent lowercase ylab for "thalach").
for (var in colnames(df_cualitativo)) {
  label <- paste0(toupper(substring(var, 1, 1)), substring(var, 2))
  boxplot(df_cualitativo[[var]],
          main = paste("Boxplot -", label),
          ylab = label,
          col = "orange",
          border = "brown",
          horizontal = FALSE,
          notch = TRUE)
}

3. Análisis no Supervisado:

3.1 Análisis de componentes principales

# Principal component analysis keeping all components.
# Fix: in FactoMineR::PCA(X, scale.unit, ncp, ...) the original unnamed third
# argument `dim(df)[2]` bound positionally to `scale.unit`, not to `ncp`,
# so only the default 5 components were kept. `ncp` must be named.
PCA <- PCA(df, graph = FALSE, ncp = ncol(df))
PCA
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1025 individuals, described by 13 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

3.1.1 Tabla general

# Eigenvalue table. The variance columns come from FactoMineR as percentages
# (0-100); divide by 100 so DT::formatPercentage() can render them.
eig.tmp <- PCA$eig
eig.tmp[, 2:3] <- eig.tmp[, 2:3] / 100
DT::datatable(eig.tmp) %>%
  formatRound("eigenvalue", 2) %>%
  formatPercentage(c("percentage of variance",
                     "cumulative percentage of variance"), 2)

3.1.2 Gráfico de sedimentación

# Scree plot: cumulative proportion of explained variance per component.
scree_df <- data.frame(pc = seq_len(nrow(PCA$eig)),
                       prop_varianza_acum = PCA$eig[, 3])
ggplot(scree_df, aes(x = pc, y = prop_varianza_acum, group = 1)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  labs(x = "Componente principal",
       y = "Prop. varianza explicada acumulada")

3.1.3 Tabla de cosenos cuadrados - individuos

# Squared cosines of the individuals, rendered as percentages.
cos2_ind <- PCA$ind$cos2
DT::datatable(cos2_ind) %>%
  formatPercentage(colnames(cos2_ind), 2)

3.1.4 Tabla de contribuciones - individuos

# Contributions of the individuals to each principal component.
contrib_ind <- PCA$ind$contrib
DT::datatable(contrib_ind) %>%
  formatRound(colnames(contrib_ind), 3)

3.1.5 Tabla de cosenos cuadrados - variables

# Squared cosines of the variables, rendered as percentages.
cos2_var <- PCA$var$cos2
DT::datatable(cos2_var) %>%
  formatPercentage(colnames(cos2_var), 2)

3.1.6 Tabla de contribuciones - variables

# Contributions of the variables to each principal component.
contrib_var <- PCA$var$contrib
DT::datatable(contrib_var) %>%
  formatRound(colnames(contrib_var), 3)

3.1.7 Plano principal - Cosenos cuadrados de individuos

# Principal plane: only individuals with cos2 >= 0.60 are shown, colored by
# their squared cosine (quality of representation on the first two components).
fviz_pca_ind(PCA, col.ind="cos2", select.ind = list(cos2 = 0.60), geom = "point",
   gradient.cols = c("black", "#2E9FDF", "#FC4E07" ), title  = "Cosenos cuadrados - individuos")

3.1.8 Plano principal - Contribución de individuos

# Principal plane: individuals colored by their contribution to the components.
# Fix: the title read "Ejemplo 1 Contribución", a leftover from a course
# example; renamed for consistency with the sibling cos2 plot.
fviz_pca_ind(PCA, col.ind = "contrib", geom = "point",
             gradient.cols = c("black", "#2E9FDF", "#FC4E07"),
             title = "Contribución - individuos", repel = TRUE)

3.1.9 Círculo de correlación - Cosenos cuadrados de variables

# Correlation circle of the VARIABLES (not individuals, despite the section
# heading), colored by squared cosine.
fviz_pca_var(PCA, col.var = "cos2",
   gradient.cols = c("black", "blue", "red"),
   ggtheme = theme_minimal())

3.1.10 Círculo de correlación - Contribución de variables

# Correlation circle of the VARIABLES, colored by their contribution to the
# principal components.
fviz_pca_var(PCA, col.var = "contrib",
   gradient.cols = c("black", "blue", "red"),
   ggtheme = theme_minimal())

### 3.1.11 Correlación entre variables originales y los componentes principales

# Correlation between the original variables and the principal components.
library(corrplot)
## corrplot 0.90 loaded
corrplot(PCA$var$cor)

3.2 Análisis de correspondencia simple

3.2.1 Aplicación del Análisis de correspondencia simple (ACS)

# Simple correspondence analysis; the unnamed third argument binds to `ncp`
# (dimensions kept) in FactoMineR::CA(X, ncp, ...), here the column count.
# NOTE(review): CA is intended for contingency tables of counts; df holds
# continuous clinical measurements, so interpret these results with caution.
ACS <- CA(df, graph = TRUE, dim(df)[2])

3.2.2 Valores propios - inercia explicada

# Scree plot of the CA dimensions (explained inertia per dimension).
fviz_eig(ACS,
         linecolor = "#FC4E07",
         barfill = "#00AFBB",
         barcolor = "#00AFBB")

3.2.3 Plano principal - Cosenos cuadrados de individuos

# CA row points (individuals) with cos2 >= 0.60, colored by squared cosine.
fviz_ca_row(ACS,
            col.row = "cos2",
            select.row = list(cos2 = 0.60),
            geom = "point",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
            repel = TRUE)

3.2.4 Plano principal - Cosenos cuadrados de variables

# CA column points (variables), colored by squared cosine.
fviz_ca_col(ACS,
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
            col.col = "cos2")

3.2.5 Gráfico de sobreposición

# Overlaid biplot of rows and columns; only rows with cos2 >= 0.80 are kept.
fviz_ca_biplot(ACS, select.row = list(cos2 = 0.80), repel = TRUE)
## Warning: ggrepel: 150 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

3.4 Sentido de la clusterización en el problema

# Clustering tendency of the standardized data via the Hopkins statistic.
set.seed(321)
df_scale <- scale(df, center = TRUE, scale = TRUE)

# NOTE(review): clustertend::hopkins() returns H where values near 0.5
# suggest uniformly distributed (non-clusterable) data and values near 0
# suggest genuine cluster structure — confirm the interpretation given in
# the paragraph below against the package documentation.
hopkins(data = df_scale, n = nrow(df_scale) - 1)
## $H
## [1] 0.3234038

No tiene mucho sentido la clusterización en este problema, ya que el índice de Hopkins da un resultado de 32.3 %, superior al 25 %, lo cual indica que no tiene mucho sentido realizar la clusterización. Un resultado cercano al 50 % indica que del todo no se puede clusterizar, y el índice obtenido se acerca a ese porcentaje.

3.5 Clusterización

3.5.1 Kmeans

# K-means with k = 5 on the standardized data (seed set in section 3.4).
# NOTE(review): consider nstart > 1 for a more stable solution.
km.datos <- kmeans(x = df_scale, centers = 5)

# Fix: the plot title said "Datos iris" — a leftover from a course example;
# these are the Heart data.
p1 <- fviz_cluster(object = km.datos, data = df_scale,
                   ellipse.type = "norm", geom = "point", main = "Datos Heart",
                   stand = FALSE, palette = "jco") +
  theme_bw() + theme(legend.position = "none")

p1

3.5.2 Clustering jerárquico

# Agglomerative hierarchical clustering: Euclidean distance, complete linkage.
hc <- hclust(d = dist(x = df_scale, method = "euclidean"),
             method = "complete")

# Dendrogram cut into k = 5 groups. Fixes: the subtitle said "K=2" while
# k = 5 is used, and the strings "Herarchical"/"Lincage" were misspelled.
fviz_dend(x = hc, k = 5, cex = 0.6) +
  labs(title = "Hierarchical clustering",
       subtitle = "Distancia euclídea, Linkage complete, K=5")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

3.6 Número óptimo de k clúster

# Elbow ("Jambu") plot: total within-cluster sum of squares for k = 1..15.
dist_euc <- get_dist(df_scale, method = "euclidean")
jambu <- fviz_nbclust(x = df_scale,
                      FUNcluster = kmeans,
                      method = "wss",
                      k.max = 15,
                      diss = dist_euc,
                      nstart = 50)
jambu

Es difícil determinar la cantidad k óptima de clústeres, ya que en el Codo de Jambú se nota que la línea no logra estabilizarse; sin embargo, a criterio propio, podemos determinar que el mejor número es k = 5, aunque no se sabe con certeza.

3.7 Indicadores para la evaluación de los clústeres

3.7.1 Visual Assessment of cluster Tendency (VAT)

# Ordered dissimilarity image (VAT): dark blocks along the diagonal would
# indicate cluster structure in the Heart data.
dist_data <- dist(df_scale, method = "euclidean")

p2 <- fviz_dist(dist_data, show_labels = FALSE) +
  labs(title = "Datos - Heart") +
  theme(legend.position = "bottom")

p2